********************************************************************************
********************************************************************************
* Prepare pollution/weather/population at different levels of aggregation
********************************************************************************
********************************************************************************

clear all

set maxvar 100000
set more off
capture log close
capture restore

global tmp "d:\tmp\PollutionHealth"
capture !mkdir $tmp



capture program drop location_heat_dist_urban
program define       location_heat_dist_urban
	use monitor_dist, clear
	merge 1:1 StationID using monitors_1628, assert(2 3) keepusing(StationLon StationLat heat_real heat_qinhuai_new dist_real dist_qinhuai_new urban)
	tab StationID if _merge==2	//10 monitors are StationID = 1022A,1028A,1202A,1238A,1254A,1289A,1347A,1777A,1892A,1946A
	keep StationID StationLon StationLat heat_real heat_qinhuai_new dist_real dist_qinhuai_new urban
	save $tmp\monitor_heat_dist_urban, replace			//84% of 1,628 monitors are urban
	
	use town_dist, replace
	gen temp = substr(town, -3, .)
	gen urban = (temp ~= "乡" & temp ~= "镇" & temp ~= "林" & temp ~= "场")	//22% of 43,224 towns are urban
	keep town_code                       heat_real heat_qinhuai_new dist_real dist_qinhuai_new urban
	save $tmp\town_code_heat_dist_urban, replace

	use county_dist, replace
	gen urban=.											//Generate urban for consistency
	keep county_code                     heat_real heat_qinhuai_new dist_real dist_qinhuai_new urban
	save $tmp\county_code_heat_dist_urban, replace
end



capture program drop pm_monitor_year_month
program define       pm_monitor_year_month
	use monitor_daily_2013_2014, clear
	tab month year										//Monitor-date observations from Jan 2013 to May 2014
	gen date = mdy(month, day, year)
	isid StationID date									//StationID date are unique identifiers
	foreach var in areaname positionname {				//Monitor characteristics are invariant within StationID
		bys StationID: assert `var'[_n]==`var'[_n-1] if _n>1
		}
	foreach var in latitude longitude {
		bys StationID: assert `var'[_n]==`var'[_n-1] if _n>1
		}
	renvars aqi pm25 o3 pm10 so2 no2 co latitude longitude, postfix(_zp)	//Data from Peng Zhang
	unique StationID									//942 monitors
	save $tmp\monitor_date_2013_2014, replace
	
	use monitor_daily_2014_2018, clear
	tab month year										//Monitor-date observations from May 2014 to Dec 2018; 22Dec2018 - 26Dec2018 missing
	gen date = mdy(month, day, year)
	isid StationID date									//StationID date are unique identifiers
	foreach var in StationLon StationLat StationName StationCity {		//Monitor characteristics are invariant within StationID
		bys StationID: assert `var'[_n]==`var'[_n-1] if _n>1
		}
	unique StationID									//1608 monitors

	merge 1:1 StationID date using $tmp\monitor_date_2013_2014      //Merge in Jan 2013 to May 2014
	erase $tmp\monitor_date_2013_2014.dta
	unique StationID									//942 using-data (up to May 2014) monitors + 1608 master-data (after May 2014) monitors -> 1608 combined monitors; confirms that 942 are contained in the 1608
	format date %td
	tab date if _merge==3
	order aqi_zp PM10 AQI pm25_zp pm25 o3_zp O3 O3_8h pm10_zp so2_zp SO2 no2_zp NO2 co_zp CO, after(longitude_zp)
	summ  aqi_zp PM10 AQI pm25_zp pm25 o3_zp O3 O3_8h pm10_zp so2_zp SO2 no2_zp NO2 co_zp CO if _merge==3
	summ StationLon StationLat latitude_zp longitude_zp if _merge==3 & StationLon~=. & StationLat~=. & latitude_zp~=. & longitude_zp~=.
	replace PM10    =pm10_zp if PM10    ==.				//Favor particle levels in the master data (after May 2014) on the five overlapping dates (they are close but not the same as particle levels in the using data)
	replace pm25    =pm25_zp if pm25    ==.
	bysort StationID (date): replace StationName=StationName[_N] if StationName==""	//Using data (up to May 2014) is missing StationName & StationCity
	bysort StationID (date): replace StationCity=StationCity[_N] if StationCity==""	
	bysort StationID (date): replace StationLon=StationLon[_N] if StationLon==.		//Take coordinates in the May 2014 - Dec 2018 data for use in the Jan 2013 - May 2014 period
	bysort StationID (date): replace StationLat=StationLat[_N] if StationLat==.
	*assert StationName~="" & StationCity~=""			//17 monitors are missing StationName & StationCity even in the master data (after May 2014); StationName is completed when merging town_nearestMonitor_year_month.dta below
	tab month year             if pm25~=. & (StationName=="" | StationCity=="")		//16 monitors exit in 2015; we lack information on their coordinates and cannot calculate distances to the boundaries 
	tab StationID              if pm25~=. & (StationName=="" | StationCity=="")
	summ StationLon StationLat if pm25~=. & (StationName=="" | StationCity=="")
	bysort StationID (date): replace StationLon=longitude_zp[1] if StationLon==.	//Take coordinates from the Jan 2013 - May 2014 data when these are missing in the May 2014 - Dec 2018 data 
	bysort StationID (date): replace StationLat=latitude_zp[1]  if StationLat==.
	tab StationID if StationLon==.| StationLat==.									//Only 7 monitors now have missing coordinates

	collapse (count) numDays_pm25_YM=pm25 (mean) StationLon StationLat PM10 pm25, by(StationID year month)
	assert numDays_pm25_YM==0 if pm25==.
	assert pm25==. if numDays_pm25_YM==0
	
	unique StationID									//1608 monitors throughout
	forvalue  i = 2013/2018 {							//Monitors increase from 872 in 2013 & 945 in 2014 to 1497 in 2015 & 1605 in 2018
		unique StationID if year==`i'
		}
	save $tmp\pm_monitor_year_month, replace
	
	foreach var in StationLon StationLat {				//Monitor coordinates are invariant within StationID
		bys StationID: assert `var'[_n]==`var'[_n-1] if _n>1
		}
	preserve
		bysort StationID: keep if _n==1
		keep StationID StationLon StationLat
		save $tmp\pm_monitor_coordinates, replace
	restore
	
	collapse (sum) numDays_pm25_YM (mean) StationLon StationLat, by(StationID year)		//Find the list of active monitors, with coordinates, by year
	keep if numDays_pm25_YM>270 & numDays_pm25_YM~=.	//We consider a monitor-year active if the number of daily readings exceeds 270 (note: numDays_pm25_YM is never missing, but for reassurance allow for such a situation in the condition)
	unique StationID    								//The number of active monitors (in any year) drops from 1608 to 1597 when restricting to > 270 daily readings in a year
	forvalue  i = 2013/2018 {							//To find the 4 nearest monitors to each county among monitors that are active each year
		preserve
			keep if year==`i'
			unique StationID   							//Active monitors by year: 2013: 452, 2014: 929, 2015: 1453, 2016: 1440, 2017: 1465, 2018: 1479 
			save $tmp\active_monitors_`i', replace 
		restore
		}
end



capture program drop pm_monitor_describe
program define       pm_monitor_describe
	use $tmp\pm_monitor_year_month, clear
	gen yyyymm=year*100+month
	
	tab yyyymm if PM10~=.								//Reported in the text
	tab yyyymm if pm25~=.								//454 monitors with non-missing 1-month PM2.5 means in 201301 (457 in 201302). There are jumps in availability in late 2013 and again in 201501
	tab yyyymm if pm25~=. & numDays_pm25_YM>10			//403 monitors with non-missing 1-month PM2.5 means (based on more than 10 24-hour PM2.5 means) in 201301 (449 in 201302)
	tab yyyymm if pm25~=. & numDays_pm25_YM>20			//0 monitor with non-missing 1-month PM2.5 means (based on more than 20 24-hour PM2.5 means) in 201301 (436 in 201302)

	areg pm25 i.month i.year, a(StationID) cluster(yyyymm)		//Partial out monitor FE and month FE to examine year-on-year PM
	egen monitor_month=group(StationID month)
	areg pm25 i.year, a(monitor_month) cluster(yyyymm)			//Partial out monitor-month FE to examine year-on-year PM: Very similar
	
	areg PM10 i.month, a(StationID) cluster(yyyymm)				//Restricting only to active monitors before partialing out monitor and month (not the case currently) produces a very similar figure
	predict PM10_res if e(sample), residual
	areg pm25 i.month, a(StationID) cluster(yyyymm)
	predict pm25_res if e(sample), residual
	
	*Generate Fig. A2
	preserve
		collapse (mean) PM10 pm25 PM10_res pm25_res (sum) numDays_pm25_YM, by(StationID year)
		keep if numDays_pm25_YM>270 & numDays_pm25_YM~=.	//We consider a monitor-year active if the number of daily readings exceeds 270 (note: numDays_pm25_YM is never missing, but for reassurance allow for such a situation in the condition)
		collapse (mean) PM10 pm25 PM10_res pm25_res                      , by(year)
		twoway scatter PM10 year, msize(large) mcolor(olive) mstyle(p1) || scatter pm25 year, msize(medium) mcolor(blue) mstyle(p1) /// 
			legend(off) xlabel(, labsize(small) angle(0)) xline(0) ///
			ylabel(0(30)120, labsize(small) angle(0)) ytitle("Mean over monitors and months, by year ({&mu}g/m{superscript:3})", size(medsmall)) ///
			xtitle("Year", size(medsmall)) scheme(s1manual)			
		twoway scatter PM10_res year, msize(large) mcolor(olive) mstyle(p1) || scatter pm25_res year, msize(medium) mcolor(blue) mstyle(p1) /// 
			legend(off) xlabel(, labsize(small) angle(0)) xline(0) ///
			ylabel(, labsize(small) angle(0)) ytitle("Mean residual over monitors and months ({&mu}g/m{superscript:3})", size(medsmall)) ///
			xtitle("Year", size(medsmall)) scheme(s1manual)	
	restore
end    



*Town-year-month level PM using the 4 nearest monitors weighted by inverse distance, from Jan 2013 to Dec 2018
capture program drop pm_4within50kmTown_year_month
program define       pm_4within50kmTown_year_month
	use town_code_coordinates, clear
	
	forvalues i = 2013/2018 {								//To find the 4 nearest monitors to each town among monitors that are active each year
		preserve
			geonear town_code town_lat town_lng using $tmp\active_monitors_`i', neighbors(StationID StationLat StationLon) nearcount(4) long   //Distance from each town centroid to the 4 nearest monitors active that year
			ren km_to_StationID dist_town2monitor
			unique StationID								//Monitors qualifying as one of 4 nearest by year: 2013: 452, 2014: 929, 2015: 1453, 2016: 1440, 2017: 1465, 2018: 1479  
			summ dist_town2monitor							//Mean distance (km) by year:                      2013: 164, 2014: 108, 2015:   59, 2016:   58, 2017:   59, 2018:   58
			bysort town_code (dist_town2monitor): gen near_rank=_n 
			gen year=`i'
			save $tmp\town2activeMonitors_`i', replace
		restore
		}
	use $tmp\town2activeMonitors_2013, clear
	forvalues i = 2014/2018 {
		append using $tmp\town2activeMonitors_`i'
		erase $tmp\town2activeMonitors_`i'.dta
		}
	erase $tmp\town2activeMonitors_2013.dta

	expand 12												//Jan - Dec = 12 months
	bysort town_code StationID year: gen month = _n
	tab month year
	save $tmp\town_4nearestMonitor_year_month, replace

	use $tmp\pm_monitor_year_month, clear
	merge 1:m StationID year month using $tmp\town_4nearestMonitor_year_month, keep(2 3)		//One StationID-year-month observation merged to many town_code-StationID-year-month observations for which StationID is one of the the 4 nearest monitors
	erase $tmp\town_4nearestMonitor_year_month.dta        	//18,150 town_code-StationID-year-month quadruples with inactive monitoring (relative to 12,409,914 active quadruples); this is low given the restriction above to monitors with > 270 daily readings in a year
	unique StationID if _merge==2							//This happens at 83 monitors (out of the pool of 1597 monitors that are among the 4 nearest to at least one town)
	replace numDays_pm25_YM=0 if _merge==2
	isid town_code StationID year month 					//town_code StationID year month are unique identifiers
	************************************************		
	*drop if dist_town2monitor>25							//Robustness: monitors at most 25 km away
	drop if dist_town2monitor>50							//Allow monitors at most 50 km away to contribute to a town's inverse-distance weighted PM measure
	unique town_code										//The number of towns with at least one year-month observation (at any point during the time period) drops to 26,942 (or 14,618 if require monitors at most 25 km away)
	bysort town_code year month: gen num_near=_N
	forvalue  i = 1/4 {										//The number of townships with 1, 2, 3, 4 active monitor(s) within 50 km (at any point in time) is 2975, 4608, 4782, 18,466, respectively. (Note: the parentheses explain why the sum > 26,942.)
		unique town_code if num_near==`i'
		}
	unique town_code if year==2015 & (pm25~=.|PM10~=.)		//Reported in the text
	************************************************
	keep  town_code StationID dist_town2monitor year month numDays_pm25_YM PM10 pm25
	order town_code StationID dist_town2monitor year month numDays_pm25_YM PM10 pm25
	bysort year: summ numDays_pm25_YM PM10 pm25				//Particle levels decrease monotonically from 2013 to 2018
	
	*Take the weighted average across the 4 nearest monitors using the inverse square distance as weights
	foreach var of varlist PM10 pm25 {
		gen weight_`var' = dist_town2monitor^(-2) if `var'~=.
		*gen weight_`var' = dist_town2monitor^(-1) if `var'~=.		//Robustness using ^(-1)
		replace    `var' = `var'*weight_`var'
		}
	collapse (max) numDays_pm25_YM (sum) PM10 pm25 weight_PM10 weight_pm25, by(town_code year month)
	foreach var of varlist PM10 pm25 {
		replace `var' = `var'/weight_`var'
		ren `var' `var'_near4
		drop weight_`var'
		}	
	assert pm25==.            if numDays_pm25_YM==0
	assert numDays_pm25_YM==0 if pm25==.

	save $tmp\pm_4within50kmTown_year_month, replace		//1.6 million non-missing one-month means (town_code-year-month triples)
end 



capture program drop pm_town_describe
program define       pm_town_describe
	use $tmp\pm_4within50kmTown_year_month, clear
	gen yyyymm=year*100+month
	tab yyyymm if pm25~=.									//9728 towns with non-missing 1-month PM2.5 means in 201301. There are jumps in availability in 201401 and again in 201501
	tab yyyymm if pm25~=. & numDays_pm25_YM>10				//9081 towns with non-missing 1-month PM2.5 means (based on more than 10 24-hour PM2.5 means in at least one of the 4 nearest monitors) in 201301
	ciplot PM10 pm25, by(yyyymm) legend(order(2 "PM10" 3 "PM2.5")) ytitle("1-mo mean of 24-h means of 1-h means ({&mu}g m{superscript:-3})") xtitle("Year-month") xlab(, angle(90) labsize(vsmall)) note("")

	areg pm25 i.month i.year, a(town_code) cluster(yyyymm)	//Partial out town FE and month FE to examine year-on-year PM
	egen town_month=group(town_code month)
	areg pm25 i.year, a(town_month) cluster(yyyymm)			//Partial out town-month FE to examine year-on-year PM: Very similar
end    



*County-year-month level PM using 4 nearest monitor weighted by inverse distance, from Jan 2013 to Dec 2018
capture program drop pm_4within50kmCounty_year_month
program define       pm_4within50kmCounty_year_month
	use county_code_coordinates, clear
	
	forvalues i = 2013/2018 {								//To find the 4 nearest monitors to each county among monitors that are active each year
		preserve
			geonear county_code county_lat county_lng using $tmp\active_monitors_`i', neighbors(StationID StationLat StationLon) nearcount(4) long   //Distance from each county centroid to the 4 nearest monitors active that year
			ren km_to_StationID dist_county2monitor
			unique StationID								//Monitors qualifying as one of 4 nearest by year: 2013: 452, 2014: 929, 2015: 1453, 2016: 1440, 2017: 1465, 2018: 1479 
			summ dist_county2monitor						//Mean distance (km) by year:                      2013: 168, 2014: 112, 2015:   63, 2016:   62, 2017:   63, 2018:   62
			bysort county_code (dist_county2monitor): gen near_rank=_n 
			gen year=`i'
			save $tmp\county2activeMonitors_`i', replace
		restore
		}
	use $tmp\county2activeMonitors_2013, clear
	forvalues i = 2014/2018 {
		append using $tmp\county2activeMonitors_`i'
		erase $tmp\county2activeMonitors_`i'.dta
		}
	erase $tmp\county2activeMonitors_2013.dta
	
	expand 12												//Jan - Dec = 12 months
	bysort county_code StationID year: gen month = _n
	tab month year
	save $tmp\county_4nearestMonitor_year_month, replace

	use $tmp\pm_monitor_year_month, clear
	merge 1:m StationID year month using $tmp\county_4nearestMonitor_year_month, keep(2 3)		//One StationID-year-month observation merged to many county_code-StationID-year-month observations for which StationID is one of the the 4 nearest monitors
	erase $tmp\county_4nearestMonitor_year_month.dta        //1199 county_code-StationID-year-month quadruples with inactive monitoring (relative to 816,433 active quadruples); this is low given the restriction above to monitors with > 270 daily readings in a year
	unique StationID if _merge==2							//This happens at 83 monitors (out of the pool of 1597 monitors that are among the 4 nearest to at least one county)
	replace numDays_pm25_YM=0 if _merge==2
	isid county_code StationID year month 					//county_code StationID year month are unique identifiers
	************************************************		
	*drop if dist_county2monitor>25							//Robustness: monitors at most 25 km away
	drop if dist_county2monitor>50							//Allow monitors at most 50 km away to contribute to a county's inverse-distance weighted PM measure
	unique county_code										//The number of counties with at least one year-month observation (at any point during the time period) drops from 2839 to 1777 (or 1014 if require monitors at most 25 km away)
	bysort county_code year month: gen num_near=_N
	forvalue  i = 1/4 {										//The number of counties with 1, 2, 3, 4 active monitor(s) within 50 km (at any point in time) is 198, 297, 303, 1229, respectively. (Note: the parentheses explains why 229+314+358+1278>1781.)
		unique county_code if num_near==`i'
		}
	unique county_code if year==2015 & (pm25~=.|PM10~=.)	//Reported in the text
	************************************************
	keep  county_code StationID dist_county2monitor year month numDays_pm25_YM PM10 pm25
	order county_code StationID dist_county2monitor year month numDays_pm25_YM PM10 pm25
	bysort year: summ numDays_pm25_YM PM10 pm25				//Particle levels decrease monotonically from 2013 to 2018
	
	*Take the weighted average across the 4 nearest monitors using the inverse square distance as weights
	foreach var of varlist PM10 pm25 {
		gen weight_`var' = dist_county2monitor^(-2) if `var'~=.			
		*gen weight_`var' = dist_county2monitor^(-1) if `var'~=.		//Robustness using ^(-1)
		replace    `var' = `var'*weight_`var'
		}
	collapse (max) numDays_pm25_YM (sum) PM10 pm25 weight_PM10 weight_pm25, by(county_code year month)
	foreach var of varlist PM10 pm25 {
		replace `var' = `var'/weight_`var'
		ren `var' `var'_near4
		drop weight_`var'
		}	
	assert pm25==.            if numDays_pm25_YM==0
	assert numDays_pm25_YM==0 if pm25==.

	save $tmp\pm_4within50kmCounty_year_month, replace		//107,332 non-missing one-month means (county_code-year-month triples)
end 



capture program drop W_station_year_month
program define       W_station_year_month
	use gsod_china_2013_2018, clear
	gen year  = int( yearmoda/10000)
	gen month = int((yearmoda - year*10000)/100)
	gen day   = int( yearmoda - year*10000 - month*100)
	gen date  = mdy(month, day, year)
	format date %td
	drop yearmoda
	tab month year										//Weather station-date observations from Jan 2013 to Dec 2018
	isid usaf date										//usaf date are unique identifiers
	renvars lat lon, prefix(usaf_)
	foreach var in usaf_lat usaf_lon {					//Weather station characteristics are invariant within usaf
		bys usaf: assert `var'[_n]==`var'[_n-1] if _n>1
		}

	collapse (count) numDays_temp_YM=tem_mean (mean) usaf_lat usaf_lon temp=tem_mean prcp, by(usaf year month)
	
	unique usaf
	forvalue  i = 2013/2018 {							//Number of weather stations is stable over the years
		unique usaf if year==`i'
		}
	save $tmp\W_station_year_month, replace
	
	collapse (sum) numDays_temp_YM (mean) usaf_lon usaf_lat, by(usaf year)	//Find the list of active NOAA stations, with coordinates, by year
	keep if numDays_temp_YM>270 & numDays_temp_YM~=.		//We consider a station-year active if the number of daily readings exceeds 270 (note: numDays_temp_YM is never missing, but for reassurance allow for such a situation in the condition)
	unique usaf												//The number of active stations (in any year) drops from 409 to 400 when restricting to > 270 daily readings in a year
	forvalue  i = 2013/2018 {								//To find the 4 nearest stations to each county among stations that are active each year
		preserve
			keep if year==`i'
			unique usaf										//Active stations by year: 2013: 396, 2014-2015: 397, 2016: 399, 2017: 396, 2018: 395 
			save $tmp\active_stations_`i', replace
		restore
		}
		
	bys usaf: keep if _n==1
	keep usaf usaf_lat usaf_lon
	save $tmp\W_station_coordinates, replace
end



capture program drop W_4within200kmTown_year_month
program define       W_4within200kmTown_year_month
    use town_code_coordinates, clear
	
	forvalues i = 2013/2018 {								//To find the 4 nearest stations to each town among stations that are active each year
		preserve
			geonear town_code town_lat town_lng using $tmp\active_stations_`i', neighbors(usaf usaf_lat usaf_lon) nearcount(4) long   //Distance from each town centroid to the 4 nearest stations active that year
			ren km_to_usaf dist_town2usaf
			unique usaf										//Stations qualifying as one of 4 nearest by year: 2013: 386, ... stable 
			summ dist_town2usaf								//Mean distance (km) by year:                      2013: 105, ... stable
			bysort town_code (dist_town2usaf): gen near_rank=_n 
			gen year=`i'
			save $tmp\town2activeStations_`i', replace
		restore
		}
	use $tmp\town2activeStations_2013, clear
	forvalues i = 2014/2018 {
		append using $tmp\town2activeStations_`i'
		erase $tmp\town2activeStations_`i'.dta
		}
	erase $tmp\town2activeStations_2013.dta
	
	expand 12
	bysort town_code usaf year: gen month = _n
	tab month year
	save $tmp\town_4nearestStation_year_month, replace
	
	use $tmp\W_station_year_month, clear
	merge 1:m usaf year month using $tmp\town_4nearestStation_year_month, keep(2 3)		//One usaf-year-month observation merged to many town_code-usaf-year-month observations for which usaf is one of the the 4 nearest stations
	erase $tmp\town_4nearestStation_year_month.dta    		//1 town_code-usaf-year-month quadruples with inactive recording (relative to 12,428,063 active quadruples); this is low given the restriction above to stations with > 270 daily readings in a year
	replace numDays_temp_YM=0 if _merge==2
	isid town_code usaf year month 							//town_code usaf year month are unique identifiers
	************************************************		
	drop if dist_town2usaf>200								//Allow stations at most 200 km away to contribute to a town's inverse-distance weighted weather measures.
	unique town_code										
	bysort town_code year month: gen num_near=_N
	forvalue  i = 1/4 {										//The number of townships with 1, 2, 3, 4 active stations(s) within 200 km (at any point in time) is 297, 1042, 1596, 40,382, respectively.
		unique town_code if num_near==`i'
		}
	************************************************
	keep  town_code usaf dist_town2usaf year month numDays_temp_YM temp prcp
	order town_code usaf dist_town2usaf year month numDays_temp_YM temp prcp
	bysort year: summ numDays_temp_YM temp prcp
	
	*Take the weighted average across weather stations using the inverse square distance as weights
	foreach var of varlist temp prcp {
		gen weight_`var' = dist_town2usaf^(-2) if `var'~=.
		*gen weight_`var' = dist_town2usaf^(-1) if `var'~=.		//Robustness using ^(-1)
		replace    `var' = `var'*weight_`var'
		}
	collapse (max) numDays_temp_YM (sum) temp prcp weight_temp weight_prcp, by(town_code year month)
	foreach var of varlist temp prcp {
		replace `var' = `var'/weight_`var'
		drop weight_`var'
		}	
	summ numDays_temp_YM

	save $tmp\W_4within200kmTown_year_month, replace
end



capture program drop W_town_describe
program define       W_town_describe
	use $tmp\W_4within200kmTown_year_month, clear
	gen yyyymm=year*100+month
	
	bysort year: summ temp prcp								//Reported in the text
end



capture program drop W_4within200kmCounty_year_month 
program define       W_4within200kmCounty_year_month
    use county_code_coordinates, clear
	
	forvalues i = 2013/2018 {								//To find the 4 nearest stations to each county among stations that are active each year
		preserve
			geonear county_code county_lat county_lng using $tmp\active_stations_`i', neighbors(usaf usaf_lat usaf_lon) nearcount(4) long   //Distance from each county centroid to the 4 nearest stations active that year
			ren km_to_usaf dist_county2usaf
			unique usaf										//Stations qualifying as one of 4 nearest by year: 2013: 389, ... stable 
			summ dist_county2usaf							//Mean distance (km) by year:                      2013: 107, ... stable
			bysort county_code (dist_county2usaf): gen near_rank=_n 
			gen year=`i'
			save $tmp\county2activeStations_`i', replace
		restore
		}
	use $tmp\county2activeStations_2013, clear
	forvalues i = 2014/2018 {
		append using $tmp\county2activeStations_`i'
		erase $tmp\county2activeStations_`i'.dta
		}
	erase $tmp\county2activeStations_2013.dta
	
	expand 12
	bysort county_code usaf year: gen month = _n
	tab month year
	save $tmp\county_4nearestStation_year_month, replace
	
	use $tmp\W_station_year_month, clear
	merge 1:m usaf year month using $tmp\county_4nearestStation_year_month, keep(2 3)		//One usaf-year-month observation merged to many county_code-usaf-year-month observations for which usaf is one of the the 4 nearest stations
	erase $tmp\county_4nearestStation_year_month.dta    	//2 county_code-usaf-year-month quadruples with inactive recording (relative to 817,630 active quadruples); this is low given the restriction above to stations with > 270 daily readings in a year
	replace numDays_temp_YM=0 if _merge==2
	isid county_code usaf year month 						//county_code usaf year month are unique identifiers
	************************************************		
	*drop if dist_county2usaf>100							//We allow stations at most 200 km away to contribute to a county's inverse-distance weighted weather measures.
	drop if dist_county2usaf>200
	unique county_code										//The number of counties with at least one year-month observation (at any point during the time period) drops from 2839 to 2832
	bysort county_code year month: gen num_near=_N
	forvalue  i = 1/4 {										//The number of counties with 1, 2, 3, 4 active stations(s) within 200 km (at any point in time) is 34, 80, 133, 2595, respectively.
		unique county_code if num_near==`i'
		}
	************************************************
	keep  county_code usaf dist_county2usaf year month numDays_temp_YM temp prcp
	order county_code usaf dist_county2usaf year month numDays_temp_YM temp prcp
	bysort year: summ numDays_temp_YM temp prcp
	
	*Take the weighted average across weather stations using the inverse square distance as weights
	foreach var of varlist temp prcp {
		gen weight_`var' = dist_county2usaf^(-2) if `var'~=.
		*gen weight_`var' = dist_county2usaf^(-1) if `var'~=.	//Robustness using ^(-1)
		replace    `var' = `var'*weight_`var'
		}
	collapse (max) numDays_temp_YM (sum) temp prcp weight_temp weight_prcp, by(county_code year month)
	foreach var of varlist temp prcp {
		replace `var' = `var'/weight_`var'
		drop weight_`var'
		}	
	summ numDays_temp_YM

	save $tmp\W_4within200kmCounty_year_month, replace
end



capture program drop W_4within200kmMonitor_year_month 
program define       W_4within200kmMonitor_year_month
    use $tmp\pm_monitor_coordinates, clear
	
	forvalues i = 2013/2018 {								//To find the 4 nearest usaf stations to each pollutant monitor among usaf stations that are active each year
		preserve
			geonear StationID StationLat StationLon using $tmp\active_stations_`i', neighbors(usaf usaf_lat usaf_lon) nearcount(4) long   //Distance from each pollutant monitor to the 4 nearest usaf stations active that year
			ren km_to_usaf dist_monitor2usaf
			unique usaf										//Stations qualifying as one of 4 nearest by year: 2013: 364, ... stable 
			summ dist_monitor2usaf							//Mean distance (km) by year:                      2013: 99,  ... stable
			bysort StationID (dist_monitor2usaf): gen near_rank=_n 
			gen year=`i'
			save $tmp\monitor2activeStations_`i', replace
		restore
		}
	use $tmp\monitor2activeStations_2013, clear
	forvalues i = 2014/2018 {
		append using $tmp\monitor2activeStations_`i'
		erase $tmp\monitor2activeStations_`i'.dta
		}
	erase $tmp\monitor2activeStations_2013.dta
	
	expand 12
	bysort StationID usaf year: gen month = _n
	tab month year
	save $tmp\monitor_4nearestStation_year_month, replace
	
	use $tmp\W_station_year_month, clear
	merge 1:m usaf year month using $tmp\monitor_4nearestStation_year_month, keep(2 3)		//One usaf-year-month observation merged to many StationID-usaf-year-month observations for which usaf is one of the the 4 nearest stations
	erase $tmp\monitor_4nearestStation_year_month.dta    	//0 StationID-usaf-year-month quadruples with inactive recording (relative to 461,088 active quadruples)
	replace numDays_temp_YM=0 if _merge==2
	isid StationID usaf year month 							//StationID usaf year month are unique identifiers
	************************************************		
	*drop if dist_monitor2usaf>100							//We allow stations at most 200 km away to contribute to a monitor's inverse-distance weighted weather measures.
	drop if dist_monitor2usaf>200
	unique StationID										
	bysort StationID year month: gen num_near=_N
	forvalue  i = 1/4 {										//The number of monitors with 1, 2, 3, 4 active stations(s) within 200 km (at any point in time) is 5, 31, 48, 1522, respectively. 
		unique StationID if num_near==`i'
		}
	************************************************
	keep  StationID usaf dist_monitor2usaf year month numDays_temp_YM temp prcp
	order StationID usaf dist_monitor2usaf year month numDays_temp_YM temp prcp
	bysort year: summ numDays_temp_YM temp prcp
	
	*Take the weighted average across weather stations using the inverse square distance as weights
	foreach var of varlist temp prcp {
		gen weight_`var' = dist_monitor2usaf^(-2) if `var'~=.
		*gen weight_`var' = dist_monitor2usaf^(-1) if `var'~=.	//Robustness using ^(-1)
		replace    `var' = `var'*weight_`var'
		}
	collapse (max) numDays_temp_YM (sum) temp prcp weight_temp weight_prcp, by(StationID year month)
	foreach var of varlist temp prcp {
		replace `var' = `var'/weight_`var'
		drop weight_`var'
		}	
	summ numDays_temp_YM

	save $tmp\W_4within200kmMonitor_year_month, replace
end



capture program drop pm_W_dist_combine
program define       pm_W_dist_combine
	*For town-level pollution analysis
	use $tmp\pm_4within50kmTown_year_month, clear
	merge 1:1 town_code year month using $tmp\W_4within200kmTown_year_month
	unique town_code if _merge==1						//No town_code-year-month triples have missing weather data
	unique town_code if _merge==2						//33,432 town_code-year-month triples have missing PM data
	replace numDays_temp_YM=0 if _merge==1
	replace numDays_pm25_YM=0 if _merge==2
	assert pm25==.            if numDays_pm25_YM==0
	assert numDays_pm25_YM==0 if pm25==.
	assert numDays_temp_YM==0 if temp==.
	assert temp==. if numDays_temp_YM==0
	drop _merge
	
	merge m:1 town_code using $tmp\town_code_heat_dist_urban, assert(2 3)	//Many town-year-month observations of environmental conditions merged to one town observation of heating policy including distance
	drop if _merge==2									//72 towns have missing PM and weather data throughout the time period, in part due to the restriction that there is an active PM monitor within 50 km 
	drop _merge
	ren dist_qinhuai_new dist_qinhuai_IZ
	ren heat_qinhuai_new heat_qinhuai_IZ
	replace dist_qinhuai_IZ = dist_qinhuai_IZ/1000
	replace dist_real       = dist_real      /1000
	label var dist_qinhuai_IZ "distance (km) from the town centroid to the Qin-Huai boundary (Ito and Zhang version)"
	label var heat_qinhuai_IZ "dummy var; north of the Qin-Huai boundary boundary (Ito and Zhang version) = 1, otherwise = 0"
	label var dist_real       "distance (km) from the town centroid to the alternative heating boundary"
	label var heat_real       "dummy var; north of the alternative heating boundary = 1, otherwise = 0"
	
	tab month year if pm25_near4==.
	tab month year if temp==.
	drop if pm25_near4==.		//Because we will collapse within subperiod (2013-2015 or 2016-2018), we want to balance out the seasons across the PM and weather data, 2013 in particular 
	
	isid town_code year month
	save $tmp\pm_W_dist_town_year_month, replace
	
	
	*For county-level pollution analysis
	use $tmp\pm_4within50kmCounty_year_month, clear
	merge 1:1 county_code year month using $tmp\W_4within200kmCounty_year_month
	unique county_code if _merge==1						//No county_code-year-month triples have missing weather data
	unique county_code if _merge==2						//2166 county_code-year-month triples have missing PM data
	replace numDays_temp_YM=0 if _merge==1
	replace numDays_pm25_YM=0 if _merge==2
	assert pm25==.            if numDays_pm25_YM==0
	assert numDays_pm25_YM==0 if pm25==.
	assert numDays_temp_YM==0 if temp==.
	assert temp==. if numDays_temp_YM==0
	drop _merge
	
	merge m:1 county_code using $tmp\county_code_heat_dist_urban, assert(2 3)	//Many county-year-month observations of environmental conditions merged to one county observation of heating policy including distance
	drop if _merge==2									//7 counties have missing PM and weather data throughout the time period, in part due to the restriction that there is an active PM monitor within 50 km
	drop _merge
	ren dist_qinhuai_new dist_qinhuai_IZ
	ren heat_qinhuai_new heat_qinhuai_IZ
	replace dist_qinhuai_IZ = dist_qinhuai_IZ/1000
	replace dist_real       = dist_real      /1000
	label var dist_qinhuai_IZ "distance (km) from the county centroid to the Qin-Huai boundary (Ito and Zhang version)"
	label var heat_qinhuai_IZ "dummy var; north of the Qin-Huai boundary (Ito and Zhang version) = 1, otherwise = 0"
	label var dist_real       "distance (km) from the county centroid to the alternative heating boundary"
	label var heat_real       "dummy var; north of the alternative heating boundary = 1, otherwise = 0"
	
	tab month year if pm25_near4==.
	tab month year if temp==.
	drop if pm25_near4==.		//Because we will collapse within subperiod (2013-2015 or 2016-2018), we want to balance out the seasons across the PM and weather data, 2013 in particular 
	
	isid county_code year month
	save $tmp\pm_W_dist_county_year_month, replace
	
	
	*For monitor-level pollution analysis
	use $tmp\active_monitors_2013, clear				//Keep observations for active monitors only (active by year, active defined as over 270 daily means in the year)
	forvalues i = 2014/2018 {
		append using $tmp\active_monitors_`i'
		}
	
	expand 12											//Jan - Dec = 12 months
	bysort StationID year: gen month = _n
	tab month year
	keep StationID year month 
	save $tmp\activeMonitor_year_month, replace
	
	use $tmp\pm_monitor_year_month, clear
	
	***Comment out from here...***
	merge 1:1 StationID year month using $tmp\activeMonitor_year_month			//One StationID-year-month observation merged to many county_code-StationID-year-month observations for which StationID is one of the the 4 nearest monitors
	unique StationID if _merge==1						//5679 StationID-year-month triples correspond to 571 less-than-active monitors; only 2390 triples without missing PM2.5
	sum pm25 if _merge==1
	drop if _merge==1
	unique StationID if _merge==2						//95 StationID-year-month triples, corresponding to 83 active monitors, have missing PM data
	replace numDays_pm25_YM=0 if _merge==2
	drop _merge
	***...to here if we extend sample to active and less-than-active monitors***
	
	erase $tmp\activeMonitor_year_month.dta
	
	drop StationLon StationLat
	merge 1:1 StationID year month using $tmp\W_4within200kmMonitor_year_month
	unique StationID if _merge==1						//7 (0 if restricted to active monitors above) StationID-year-month triples have missing weather data
	unique StationID if _merge==2						//1149 (1208 if restricted to active monitors above) StationID-year-month triples have missing PM data
	replace numDays_temp_YM=0 if _merge==1
	replace numDays_pm25_YM=0 if _merge==2
	assert pm25==.            if numDays_pm25_YM==0
	assert numDays_pm25_YM==0 if pm25==.
	assert numDays_temp_YM==0 if temp==.
	assert temp==. if numDays_temp_YM==0
	drop _merge
	
	merge m:1 StationID using $tmp\monitor_heat_dist_urban, assert(2 3)
	drop if _merge==2									//27 monitors have missing PM and weather data throughout the time period, likely because of the restriction that there is an active PM monitor within 50 km
	drop _merge
	ren dist_qinhuai_new dist_qinhuai_IZ
	ren heat_qinhuai_new heat_qinhuai_IZ
	replace dist_qinhuai_IZ = dist_qinhuai_IZ/1000
	replace dist_real       = dist_real      /1000
	label var dist_qinhuai_IZ "distance (km) from the pollutant monitor to the Qin-Huai boundary (Ito and Zhang version)"
	label var heat_qinhuai_IZ "dummy var; north of the Qin-Huai boundary (Ito and Zhang version) = 1, otherwise = 0"
	label var dist_real       "distance (km) from the pollutant monitor to the alternative heating boundary"
	label var heat_real       "dummy var; north of the alternative heating boundary = 1, otherwise = 0"
	
	tab month year if pm25==.
	tab month year if temp==.
	drop if pm25==.		//Because we will collapse within subperiod (2013-2015 or 2016-2018), we want to balance out the seasons in the PM and weather data, 2013 in particular 
	
	ren StationID monitor_code
	isid monitor_code year month
	save $tmp\pm_W_dist_monitor_year_month, replace
end



location_heat_dist_urban
pm_monitor_year_month
pm_monitor_describe
pm_4within50kmTown_year_month
pm_town_describe
pm_4within50kmCounty_year_month

W_station_year_month
W_4within200kmTown_year_month
W_town_describe
W_4within200kmCounty_year_month
W_4within200kmMonitor_year_month

pm_W_dist_combine




*Town-2010 and county-year level population from 2013 to 2018; sources are Census (town) and CDC (county)
capture program drop population_year
program define       population_year
	use population_town_2010.dta, clear							//Township-level from the 2010 Census
	isid town_code												//Cross-section of 41,446 towns

	*Missing gender breakdown, age breakdown, total population
	gen popBothGenders = pop_male + pop_female					//45 towns missing some gender breakdown
	gen popAllAges = pop_0_14 + pop_15_64 + pop_above64			//108 towns missing some age breakdown
	replace pop_total = popBothGenders if popBothGenders==popAllAges & pop_total==.		//18 towns no longer missing total population by using sum of gender or age breakdown

	*Same total and popAllAges, missing a gender population
	replace pop_female = pop_total - pop_male   if popAllAges==pop_total & popBothGenders!=pop_total & pop_female==. & pop_male!=.	//12 towns were missing female population and observed male population
	replace pop_male   = pop_total - pop_female if popAllAges==pop_total & popBothGenders!=pop_total & pop_female!=. & pop_male==.	//18 towns were missing male population and observed female population

	replace popBothGenders = pop_male + pop_female
	gen diffBothGenders = pop_total - popBothGenders
	tab diffBothGenders	// ignore +/- 10

	replace pop_male = pop_total - pop_female if town_code == 150204101 | town_code == 230206003 | town_code == 32020006 ///	//Adjust male population using total-female difference for 19 towns
		| town_code == 320203002 | town_code == 320706101 | town_code == 370283001 | town_code == 370283004 ///
		| town_code == 370283100 | town_code == 370283101 | town_code == 520222200 | town_code == 522727102 ///
		| town_code == 522727106 | town_code == 522727107 | town_code == 522727108 | town_code == 522727200 ///
		| town_code == 522727202 | town_code == 522727204 | town_code == 532930103 | town_code == 610825208 ///
		| town_code == 320202006
		
	replace pop_female = pop_total - pop_male if town_code == 150981007 | town_code == 211421109 | town_code == 220181002 ///	//Adjust female population using total-male difference for 28 towns
		| town_code == 320202002 | town_code == 320202004 | town_code == 320721101 | town_code == 320721108 ///
		| town_code == 330681103 | town_code == 330681108 | town_code == 330681109 | town_code == 330681111 ///
		| town_code == 330683106 | town_code == 350623202 | town_code == 360430502 | town_code == 360430403 ///
		| town_code == 451302210 | town_code == 522701105 | town_code == 530828213 | town_code == 530828214 ///
		| town_code == 530828217 | town_code == 530829101 | town_code == 530829205 | town_code == 530902002 ///
		| town_code == 530902102 | town_code == 530921107 | town_code == 532528209 | town_code == 610726110 ///
		| town_code == 410728105

	replace pop_total = popAllAges if town_code == 220105001 | town_code == 320202005 | town_code == 320203001 ///	//Adjust total population using age breakdown for 15 towns
		| town_code == 320902008 | town_code == 320902102 | town_code == 330183001 | town_code == 341802007 ///
		| town_code == 370281113 | town_code == 411082203 | town_code == 511324231 | town_code == 511423114 ///
		| town_code == 513335100 | town_code == 522722210 | town_code == 530828211 | town_code == 654025102 
		
	replace pop_male = pop_total - pop_female if town_code == 220105001 | town_code == 320902102 | town_code == 530828211	//Further adjust male population using total-female difference for 3 towns 
		
	replace pop_female = pop_total - pop_male if town_code == 341802007		//Further adjust female population using total-male difference for 1 town

	replace pop_male   = 22021 if town_code == 320721109	//A few more adjustments to gender breakdown and/or total
	replace pop_female = 23383 if town_code == 320721109
	
	replace pop_male   = 24385                if town_code == 320902007
	replace pop_total = pop_male + pop_female if town_code == 320902007
	
	replace pop_total  = popBothGenders       if town_code == 360430200 | town_code == 522427203 | town_code == 522427206
		
	replace pop_male = 19566                  if town_code == 150204102
	replace pop_female = pop_total - pop_male if town_code == 150204102

	drop popAllAges diffBothGenders
	
	replace pop_0_14    = pop_total - pop_15_64 - pop_above64 if pop_0_14==. & pop_15_64!=. & pop_above64!=. & pop_total!=.			//41 towns were missing pop_0_14 and observed other age groups
	replace pop_15_64   = pop_total - pop_0_14  - pop_above64 if pop_0_14!=. & pop_15_64==. & pop_above64!=. & pop_total!=.			//10 towns were missing pop_15_64 and observed other age groups
	replace pop_above64 = pop_total - pop_0_14   - pop_15_64  if pop_0_14!=. & pop_15_64!=. & pop_above64==. & pop_total!=.			//32 towns were missing pop_above64 and observed other age groups
	
	replace pop_0_14    = 0 if pop_total==pop_15_64 & pop_0_14==.																	//21 towns with no children and no elderly
	replace pop_above64 = 0 if pop_total==pop_15_64 & pop_above64==.

	gen popAllAges = pop_0_14 + pop_15_64 + pop_above64																				//4 towns still missing some age breakdown (cf. 108 above)
	gen diffAllAges = pop_total - popAllAges
	tab diffAllAges		// ignore +/- 10
	
	gen ratio_0_14 = pop_0_14/pop_total																								//Adjust child, elderly, middle-aged population based on ratios that are 'too high'
	replace pop_0_14 = pop_total - pop_15_64 - pop_above64 if diffAllAges<0 & ratio_0_14>=0.6										//29 towns with child population that is 'too high'
	replace popAllAges = pop_0_14 + pop_15_64 + pop_above64
	replace diffAllAges = pop_total - popAllAges
	
	gen ratio_above64 = pop_above64/pop_total
	replace pop_above64 = pop_total - pop_0_14 - pop_15_64 if diffAllAges<0 & ratio_above64>=1										//12 towns with elderly population that is 'too high'
	replace popAllAges = pop_0_14 + pop_15_64 + pop_above64
	replace diffAllAges = pop_total - popAllAges
	
	gen ratio_15_64 = pop_15_64/pop_total	
	replace pop_15_64 = pop_total - pop_0_14 - pop_above64 if diffAllAges<0 & ratio_15_64>=1										//41 towns with middle-aged population that is 'too high'
	replace popAllAges = pop_0_14 + pop_15_64 + pop_above64
	replace diffAllAges = pop_total - popAllAges

	replace pop_0_14    = pop_total - pop_15_64 - pop_above64 if town_code == 360103007												//A few more adjustments to age breakdown
	replace pop_15_64   = pop_total - pop_0_14 - pop_above64 if town_code == 211021107 | town_code == 230302200 ///
		| town_code == 341023103 | town_code == 341226117
	replace pop_above64 = pop_total - pop_0_14 - pop_15_64 if town_code == 320684113 | town_code == 450902005 ///
		| town_code == 511132205 | town_code == 511132222 | town_code == 621026207

	drop popBothGenders popAllAges diffAllAges ratio_0_14 ratio_above64 ratio_15_64
	summ pop_total, detail									//Around 1% of towns' population is smaller than 405, some towns even have only less than 10 people
	save $tmp/population_town_2010, replace
	
	
	use population_county_2013, clear						//County-year-level from the Chinese Center for Disease Control and Prevention; these data are proprietary and can be accessed through that organization
	append using population_county_2014 population_county_2015 population_county_2016 population_county_2017 population_county_2018, force
	ren code county_code
	isid county_code ageid year								//An observation is a county_code ageid year
	assert totalpop~=.
	tab county_code ageid if totalpop~=malepop+femalepop	//Only 3 observations (pertaining to the code for the sum of all counties?) have total population different from the sum of male and female, but the difference is within 10
	assert totalpop-malepop-femalepop<=10
	tab year ageid											//The number of counties rises gently from 3498 in 2013 to 3526 in 2018; the age breakdown is complete
	collapse (sum) pop_total=totalpop pop_male=malepop pop_female=femalepop, by(county_code year)		//Align with variable names for town population dataset
	assert pop_male==0   if pop_female==0					//0 male population <=> 0 female population
	assert pop_female==0 if pop_male==0
	assert pop_total==0  if pop_male==0						//0 total population <=> 0 male population
	assert pop_male==0   if pop_total==0
	foreach var of varlist pop_total pop_male pop_female {	//0 means the record is missing; this matters a bit because we will average over 3-year periods
		replace `var'=. if `var'==0
		}
	save $tmp/population_county_year, replace
	
	*Examine county-level population growth
	bysort year: summ pop_total if county_code~=0			//County codes include subtotals of county aggregations; 2013: 3494*1161603=4.059 bi, 2014: 4.063 bi, 2015: 4.097 bi, 2016: 4.121 bi, 2017: 4.148 bi, 2018: 4.177 bi
	bysort county_code (year): gen growthYoY=100*(pop_total/pop_total[_n-1]-1) if _n>1
	bysort year: summ growthYoY, detail						//Year-on-year growth rates across county-code has medians of 0.5% but there are some large rates on both ends of the distribution
	histogram growthYoY if growthYoY<5 & growthYoY>-5
	
	
	*Consistency between 2010 Census town vs. CDC year-varying county population cross-sections
	collapse (mean) pop_county=pop_total, by(county_code)	//Collapse CDC year-varying county population to mean over 2013-2018
	unique county_code										//3653 unique county codes
	save $tmp/population_county_meanYears, replace
	
	* Aggregate 2010 Census town population to county population
	use $tmp/population_town_2010, clear
	collapse (sum) pop_townAggr=pop_total, by(county_code_pop)
	ren county_code_pop county_code
	tempfile t
	save `t', replace
	
	*Merge aggregated 2010 Census town population into CDC county population (mean over 2013-2018)
	use $tmp/population_county_meanYears, clear
	merge 1:1 county_code using `t'							//23 county codes from 2010 Census data but not CDC data; 877 county codes from CDC data (some are sub-totals of county aggregations) but not 2010 Census data  
	keep if _merge==3
	erase $tmp/population_county_meanYears.dta
	
	replace pop_county   = pop_county/10000
	replace pop_townAggr = pop_townAggr/10000
	scatter pop_county pop_townAggr ///
		, msize(small) xtitle("County-level population aggregated from 2010 town Census" "unit: 10,000") ///
		ytitle("County-level population from the CDC (mean 2013-18)")
	scatter pop_county pop_town if pop_town < 1000 ///
		, msize(small) xtitle("County-level population aggregated from 2010 town Census" "unit: 10,000") ///
		ytitle("County-level population from the CDC (mean 2013-18)")
end



population_year




******************************************************************************************************************************************
*From here to the end, three programs prepare data to conduct a robustness test that accounts for prevailing wind direction (Fig. A10, 
*Table A15). Contact the corresponding author to obtain the NOAA data at the station-date-hour level, totaling 6.7 GB. 
*Alternatively, we are also publishing the datasets generated by these three programs, for the purpose of replicating Table A15, 
*specifically, windBlowsToB_monitor_year_month.dta and active_windStations_2013.dta to active_windStations_2018.dta.
******************************************************************************************************************************************
capture program drop wind_station_date_hour
program define       wind_station_date_hour
	*2013
	use NOAA_2013_hourly, clear					//Memory space for this year's file is twice that for years 2014 to 2018
	assert year==2013
	ren station windStation
	unique windStation							//411 NOAA wind stations
	unique name									//407 names
	foreach var in source report_type call_sign quality_control {
		tab `var'
		}
	replace wnd_direct=. if wnd_direct==999	
	assert wnd_direct==. if wnd_speed==9999 
	replace wnd_speed=.  if wnd_speed==9999
	tab wnd_type								//9=missing, C=calm, N=normal, V=variable
	summ wnd_direct wnd_speed if wnd_type=="9"	//Wind direction always missing
	summ wnd_direct wnd_speed if wnd_type=="C"	//Wind direction always missing
	summ wnd_direct if wnd_type=="V"			//Wind direction occasionally missing (75,935 vs. 122,519)
	tab wnd_direct_quality						//1=Passed all quality control checks, 5=Passed all quality control checks, data originate from an NCEI data source, 9=Passed gross limits check if element is present
	assert wnd_direct==. if wnd_direct_quality==9	//Always missing
	*assert wnd_direct==360 if wnd_speed==0 & wnd_type=="N"		//(1 exception among 5968 observations) Replace wind direction to missing
	count if wnd_direct==360 & wnd_speed==0 & wnd_type=="N"
	tab wnd_direct if wnd_speed==0
	
	keep windStation latitude longitude elevation name date hour wnd_direct wnd_direct_quality wnd_type wnd_speed wnd_speed_quality
	foreach var in latitude longitude elevation {			//Wind station characteristics are invariant within windStation
		bys windStation: assert `var'[_n]==`var'[_n-1] if _n>1
		}
	
	tab hour
	tab hour if wnd_direct~=.					//Main hours of measurement: 0, 3, 6, 9, 12, 15 (more missing here), 18, 21
	keep if hour==0|hour==3|hour==6|hour==9|hour==12|hour==15|hour==18|hour==21
	
	bysort windStation date hour: gen duplicates=_N
	tab duplicates
	count if wnd_direct==.&duplicates>1			//Some of these duplicates have missing wind direction
	
	bysort windStation date hour (wnd_direct_quality): keep if _n==1	//Keep the first non-missing wind direction record (favoring quality control 1 over 9=>missing) within windStation-date-hour; taking the mean of an angle may not make sense (e.g., 10 and 350 is not 180=S)
	isid windStation date hour
	
	bysort windStation: gen obs=_N				//365 days/year * 8 (3-hour) measurements/day = 2920 (3-hour) measurements/year
	keep if obs>2160							//We consider a windStation-year active if the number of 3-hour readings exceeds 270*8
	unique windStation							//396 active wind stations
	sum elevation								//Max elevation is 4701m
	
	keep windStation latitude longitude date hour wnd_direct wnd_type
	save $tmp\wind_windStation_date_hour_2013, replace
	bysort windStation latitude longitude: keep if _n==1
	keep windStation latitude longitude
	save $tmp\active_windStations_2013, replace
	
	
	*2014
	use NOAA_2014_hourly, clear
	assert year==2014
	ren station windStation
	unique windStation							//409 NOAA wind stations
	unique name									//405 names
	foreach var in source report_type call_sign quality_control {
		tab `var'
		}
	replace wnd_direct=. if wnd_direct==999
	assert wnd_direct==. if wnd_speed==9999 
	replace wnd_speed=.  if wnd_speed==9999
	tab wnd_type								//9=missing, C=calm, N=normal, V=variable
	summ wnd_direct wnd_speed if wnd_type=="9"	//Wind direction always missing
	summ wnd_direct wnd_speed if wnd_type=="C"	//Wind direction always missing
	summ wnd_direct if wnd_type=="V"			//Wind direction occasionally missing (110,886 vs. 159,979)
	tab wnd_direct_quality						//1=Passed all quality control checks, 5=Passed all quality control checks, data originate from an NCEI data source, 9=Passed gross limits check if element is present
	assert wnd_direct==. if wnd_direct_quality==9	//Always missing
	assert wnd_direct==360 if wnd_speed==0 & wnd_type=="N"		//Replace wind direction to missing
	count if wnd_direct==360 & wnd_speed==0 & wnd_type=="N"
	tab wnd_direct if wnd_speed==0
	
	keep windStation latitude longitude elevation name date hour wnd_direct wnd_direct_quality wnd_type wnd_speed wnd_speed_quality
	foreach var in latitude longitude elevation {			//Wind station characteristics are invariant within windStation
		bys windStation: assert `var'[_n]==`var'[_n-1] if _n>1
		}
	
	tab hour
	tab hour if wnd_direct~=.					//Main hours of measurement: 0, 3, 6, 9, 12, 15 (more missing here), 18, 21
	keep if hour==0|hour==3|hour==6|hour==9|hour==12|hour==15|hour==18|hour==21
	
	bysort windStation date hour: gen duplicates=_N
	tab duplicates
	count if wnd_direct==.&duplicates>1			//Some of these duplicates have missing wind direction
	
	bysort windStation date hour (wnd_direct_quality): keep if _n==1	//Keep the first non-missing wind direction record (favoring quality control 1 over 9=>missing) within windStation-date-hour; taking the mean of an angle may not make sense (e.g., 10 and 350 is not 180=S)
	isid windStation date hour
	
	bysort windStation: gen obs=_N				//365 days/year * 8 (3-hour) measurements/day = 2920 (3-hour) measurements/year
	keep if obs>2160							//We consider a windStation-year active if the number of 3-hour readings exceeds 270*8
	unique windStation							//397 active wind stations
	sum elevation
	
	keep windStation latitude longitude date hour wnd_direct wnd_type
	save $tmp\wind_windStation_date_hour_2014, replace
	bysort windStation latitude longitude: keep if _n==1
	keep windStation latitude longitude
	save $tmp\active_windStations_2014, replace
	
	
	*2015
	use NOAA_2015_hourly, clear
	assert year==2015
	ren station windStation
	unique windStation							//407 NOAA wind stations
	unique name									//403 names
	foreach var in source report_type call_sign quality_control {
		tab `var'
		}
	replace wnd_direct=. if wnd_direct==999
	replace wnd_speed=.  if wnd_speed==9999
	tab wnd_type								//9=missing, C=calm, N=normal, V=variable
	summ wnd_direct wnd_speed if wnd_type=="9"	//Wind direction always missing
	summ wnd_direct wnd_speed if wnd_type=="C"	//Wind direction always missing
	summ wnd_direct if wnd_type=="V"			//Wind direction occasionally missing (109,399 vs. 159,623)
	tab wnd_direct_quality						//1=Passed all quality control checks, 5=Passed all quality control checks, data originate from an NCEI data source, 9=Passed gross limits check if element is present
	assert wnd_direct==. if wnd_direct_quality==9	//Always missing
	assert wnd_direct==360 if wnd_speed==0 & wnd_type=="N"		//Replace wind direction to missing
	count if wnd_direct==360 & wnd_speed==0 & wnd_type=="N"
	tab wnd_direct if wnd_speed==0
	
	keep windStation latitude longitude elevation name date hour wnd_direct wnd_direct_quality wnd_type wnd_speed wnd_speed_quality
	foreach var in latitude longitude elevation {			//Wind station characteristics are invariant within windStation
		bys windStation: assert `var'[_n]==`var'[_n-1] if _n>1
		}
	
	tab hour
	tab hour if wnd_direct~=.					//Main hours of measurement: 0, 3, 6, 9, 12, 15 (more missing here), 18, 21
	keep if hour==0|hour==3|hour==6|hour==9|hour==12|hour==15|hour==18|hour==21
	
	bysort windStation date hour: gen duplicates=_N
	tab duplicates
	count if wnd_direct==.&duplicates>1			//Some of these duplicates have missing wind direction
	
	bysort windStation date hour (wnd_direct_quality): keep if _n==1	//Keep the first non-missing wind direction record (favoring quality control 1 over 9=>missing) within windStation-date-hour; taking the mean of an angle may not make sense (e.g., 10 and 350 is not 180=S)
	isid windStation date hour
	
	bysort windStation: gen obs=_N				//365 days/year * 8 (3-hour) measurements/day = 2920 (3-hour) measurements/year
	keep if obs>2160							//We consider a windStation-year active if the number of 3-hour readings exceeds 270*8
	unique windStation							//396 active wind stations
	sum elevation
	
	keep windStation latitude longitude date hour wnd_direct wnd_type
	save $tmp\wind_windStation_date_hour_2015, replace
	bysort windStation latitude longitude: keep if _n==1
	keep windStation latitude longitude
	save $tmp\active_windStations_2015, replace
	
	
	*2016
	use NOAA_2016_hourly, clear
	assert year==2016
	ren station windStation
	unique windStation							//409 NOAA wind stations
	unique name									//405 names
	foreach var in source report_type call_sign quality_control {
		tab `var'
		}
	replace wnd_direct=. if wnd_direct==999
	assert wnd_direct==. if wnd_speed==9999 
	replace wnd_speed=.  if wnd_speed==9999
	tab wnd_type								//9=missing, C=calm, N=normal, V=variable
	summ wnd_direct wnd_speed if wnd_type=="9"	//Wind direction always missing
	summ wnd_direct wnd_speed if wnd_type=="C"	//Wind direction always missing
	summ wnd_direct if wnd_type=="V"			//Wind direction occasionally missing (116,695 vs. 169,528)
	tab wnd_direct_quality						//1=Passed all quality control checks, 5=Passed all quality control checks, data originate from an NCEI data source, 9=Passed gross limits check if element is present
	assert wnd_direct==. if wnd_direct_quality==9	//Always missing
	*assert wnd_direct==360 if wnd_speed==0 & wnd_type=="N"		//(1 exception among 9917 observations) Replace wind direction to missing
	count if wnd_direct==360 & wnd_speed==0 & wnd_type=="N"
	tab wnd_direct if wnd_speed==0
	
	keep windStation latitude longitude elevation name date hour wnd_direct wnd_direct_quality wnd_type wnd_speed wnd_speed_quality
	foreach var in latitude longitude elevation {			//Wind station characteristics are invariant within windStation
		bys windStation: assert `var'[_n]==`var'[_n-1] if _n>1
		}
	
	tab hour
	tab hour if wnd_direct~=.					//Main hours of measurement: 0, 3, 6, 9, 12, 15 (more missing here), 18, 21
	keep if hour==0|hour==3|hour==6|hour==9|hour==12|hour==15|hour==18|hour==21
	
	bysort windStation date hour: gen duplicates=_N
	tab duplicates
	count if wnd_direct==.&duplicates>1			//Some of these duplicates have missing wind direction
	
	bysort windStation date hour (wnd_direct_quality): keep if _n==1	//Keep the first non-missing wind direction record (favoring quality control 1 over 9=>missing) within windStation-date-hour; taking the mean of an angle may not make sense (e.g., 10 and 350 is not 180=S)
	isid windStation date hour
	
	bysort windStation: gen obs=_N				//365 days/year * 8 (3-hour) measurements/day = 2920 (3-hour) measurements/year
	keep if obs>2160							//We consider a windStation-year active if the number of 3-hour readings exceeds 270*8
	unique windStation							//395 active wind stations
	sum elevation
	
	keep windStation latitude longitude date hour wnd_direct wnd_type
	save $tmp\wind_windStation_date_hour_2016, replace
	bysort windStation latitude longitude: keep if _n==1
	keep windStation latitude longitude
	save $tmp\active_windStations_2016, replace
	
	
	*2017
	use NOAA_2017_hourly, clear
	assert year==2017
	ren station windStation
	unique windStation							//406 NOAA wind stations
	unique name									//402 names
	foreach var in source report_type call_sign quality_control {
		tab `var'
		}
	replace wnd_direct=. if wnd_direct==999
	assert wnd_direct==. if wnd_speed==9999 
	replace wnd_speed=.  if wnd_speed==9999
	tab wnd_type								//9=missing, C=calm, N=normal, V=variable
	summ wnd_direct wnd_speed if wnd_type=="9"	//Wind direction always missing
	summ wnd_direct wnd_speed if wnd_type=="C"	//Wind direction always missing
	summ wnd_direct if wnd_type=="V"			//Wind direction occasionally missing (120,305 vs. 179,907)
	tab wnd_direct_quality						//1=Passed all quality control checks, 5=Passed all quality control checks, data originate from an NCEI data source, 9=Passed gross limits check if element is present
	assert wnd_direct==. if wnd_direct_quality==9	//Always missing
	assert wnd_direct==360 if wnd_speed==0 & wnd_type=="N"		//Replace wind direction to missing
	count if wnd_direct==360 & wnd_speed==0 & wnd_type=="N"
	tab wnd_direct if wnd_speed==0
	
	keep windStation latitude longitude elevation name date hour wnd_direct wnd_direct_quality wnd_type wnd_speed wnd_speed_quality
	foreach var in latitude longitude elevation {			//Wind station characteristics are invariant within windStation
		bys windStation: assert `var'[_n]==`var'[_n-1] if _n>1
		}
	
	tab hour
	tab hour if wnd_direct~=.					//Main hours of measurement: 0, 3, 6, 9, 12, 15 (more missing here), 18, 21
	keep if hour==0|hour==3|hour==6|hour==9|hour==12|hour==15|hour==18|hour==21
	
	bysort windStation date hour: gen duplicates=_N
	tab duplicates
	count if wnd_direct==.&duplicates>1			//Some of these duplicates have missing wind direction
	
	bysort windStation date hour (wnd_direct_quality): keep if _n==1	//Keep the first non-missing wind direction record (favoring quality control 1 over 9=>missing) within windStation-date-hour; taking the mean of an angle may not make sense (e.g., 10 and 350 is not 180=S)
	isid windStation date hour
	
	bysort windStation: gen obs=_N				//365 days/year * 8 (3-hour) measurements/day = 2920 (3-hour) measurements/year
	keep if obs>2160							//We consider a windStation-year active if the number of 3-hour readings exceeds 270*8
	unique windStation							//394 active wind stations
	sum elevation
	
	keep windStation latitude longitude date hour wnd_direct wnd_type
	save $tmp\wind_windStation_date_hour_2017, replace
	bysort windStation latitude longitude: keep if _n==1
	keep windStation latitude longitude
	save $tmp\active_windStations_2017, replace
	
	
	*2018
	use NOAA_2018_hourly, clear
	assert year==2018
	ren station windStation
	unique windStation							//407 NOAA wind stations
	unique name									//403 names
	foreach var in source report_type call_sign quality_control {
		tab `var'
		}
	replace wnd_direct=. if wnd_direct==999
	assert wnd_direct==. if wnd_speed==9999 
	replace wnd_speed=.  if wnd_speed==9999
	tab wnd_type								//9=missing, C=calm, N=normal, V=variable
	summ wnd_direct wnd_speed if wnd_type=="9"	//Wind direction always missing
	summ wnd_direct wnd_speed if wnd_type=="C"	//Wind direction always missing
	summ wnd_direct if wnd_type=="V"			//Wind direction occasionally missing (117,914 vs. 173,566)
	tab wnd_direct_quality						//1=Passed all quality control checks, 5=Passed all quality control checks, data originate from an NCEI data source, 9=Passed gross limits check if element is present
	assert wnd_direct==. if wnd_direct_quality==9	//Always missing
	assert wnd_direct==360 if wnd_speed==0 & wnd_type=="N"		//Replace wind direction to missing
	count if wnd_direct==360 & wnd_speed==0 & wnd_type=="N"
	tab wnd_direct if wnd_speed==0
	
	keep windStation latitude longitude elevation name date hour wnd_direct wnd_direct_quality wnd_type wnd_speed wnd_speed_quality
	foreach var in latitude longitude elevation {			//Wind station characteristics are invariant within windStation
		bys windStation: assert `var'[_n]==`var'[_n-1] if _n>1
		}
	
	tab hour
	tab hour if wnd_direct~=.					//Main hours of measurement: 0, 3, 6, 9, 12, 15 (more missing here), 18, 21
	keep if hour==0|hour==3|hour==6|hour==9|hour==12|hour==15|hour==18|hour==21
	
	bysort windStation date hour: gen duplicates=_N
	tab duplicates
	count if wnd_direct==.&duplicates>1			//Some of these duplicates have missing wind direction
	
	bysort windStation date hour (wnd_direct_quality): keep if _n==1	//Keep the first non-missing wind direction record (favoring quality control 1 over 9=>missing) within windStation-date-hour; taking the mean of an angle may not make sense (e.g., 10 and 350 is not 180=S)
	isid windStation date hour
	
	bysort windStation: gen obs=_N				//365 days/year * 8 (3-hour) measurements/day = 2920 (3-hour) measurements/year
	keep if obs>2160							//We consider a windStation-year active if the number of 3-hour readings exceeds 270*8
	unique windStation							//393 active wind stations
	sum elevation
	
	keep windStation latitude longitude date hour wnd_direct wnd_type
	save $tmp\wind_windStation_date_hour_2018, replace
	bysort windStation latitude longitude: keep if _n==1
	keep windStation latitude longitude
	save $tmp\active_windStations_2018, replace
end	



capture program drop monitor_boundary_bearing
program define       monitor_boundary_bearing
    use $tmp\pm_monitor_coordinates, clear
	geonear StationID StationLat StationLon using QinHuaiBoundary_coordinates_every500m, neighbors(ID lat_84 lng_84) nearcount(1) long   		//For each pollutant monitor (with non-missing coordinates), find the coordinates of the nearest point on the Qin-Huai boundary
	save $tmp\monitor2nearestQH, replace
	
	use $tmp\pm_monitor_coordinates, clear
	geonear StationID StationLat StationLon using AlternativeBoundary_coordinates_every500m, neighbors(ID lat_84 lng_84) nearcount(1) long   	//For each pollutant monitor (with non-missing coordinates), find the coordinates of the nearest point on the alternative boundary
	save $tmp\monitor2nearestAlt, replace
	
	use $tmp\pm_monitor_coordinates, clear
	merge 1:1 StationID using $tmp\monitor2nearestQH, assert(1 3) nogen		//Merges in the identifier (ID) and distance of the nearest point on the Qin-Huai boundary
	merge m:1 ID using QinHuaiBoundary_coordinates_every500m, keep(1 3) nogen	keepusing(lng_84 lat_84)		//Merges in the coordinates of these nearest points on the Qin-Huai boundary
	renvars km_to_ID lng_84 lat_84 \ dist_QH lng_QH lat_QH
	drop ID										//Drop; this is an identifier of the nearest point on the Qin-Huai boundary
	merge 1:1 StationID using $tmp\monitor2nearestAlt, assert(1 3) nogen	//Merges in the identifier (ID) and distance of the nearest point on the alternative boundary
	merge m:1 ID using AlternativeBoundary_coordinates_every500m, keep(1 3) nogen	keepusing(lng_84 lat_84)	//Merges in the coordinates of these nearest points on the alternative boundary
	renvars km_to_ID lng_84 lat_84 \ dist_Alt lng_Alt lat_Alt
	drop ID										//Drop; this is an identifier of the nearest point on the alternative boundary
	erase $tmp\monitor2nearestQH.dta
	erase $tmp\monitor2nearestAlt.dta
	
	*Calculate the initial bearing ("forward azimuth") from monitor (location 1) to nearest point on the Qin-Huai boundary (location 2). Varphi and lambda are in radians, not degrees. delta_lambda is the difference in longitude. Bearing is in radians
	*http://www.movable-type.co.uk/scripts/latlong.html
	gen varphi1=StationLat * _pi / 180			//latitude of location 1
	gen lambda1=StationLon * _pi / 180			//longitude of location 1
	
	gen varphi2=lat_QH     * _pi / 180			//latitude of location 2			
	gen lambda2=lng_QH     * _pi / 180			//longitude of location 2
	gen delta_lambda_21=lambda2-lambda1
	gen bearing_rad_21=atan2(sin(delta_lambda_21)*cos(varphi2),cos(varphi1)*sin(varphi2)-sin(varphi1)*cos(varphi2)*cos(delta_lambda_21))
	gen bearing_QH=bearing_rad_21 * 180 / _pi
	replace bearing_QH = bearing_QH*(bearing_QH>=0) + (bearing_QH+360)*(bearing_QH<0)
	
	*Similarly, calculate the initial bearing ("forward azimuth") from monitor (location 1) to nearest point on the alternative boundary (location 3).
	gen varphi3=lat_Alt    * _pi / 180			//latitude of location 3			
	gen lambda3=lng_Alt    * _pi / 180			//longitude of location 3
	gen delta_lambda_31=lambda3-lambda1
	gen bearing_rad_31=atan2(sin(delta_lambda_31)*cos(varphi3),cos(varphi1)*sin(varphi3)-sin(varphi1)*cos(varphi3)*cos(delta_lambda_31))
	gen bearing_Alt=bearing_rad_31 * 180 / _pi
	replace bearing_Alt = bearing_Alt*(bearing_Alt>=0) + (bearing_Alt+360)*(bearing_Alt<0)
	
	drop varphi1 lambda1 
	drop varphi2 lambda2 delta_lambda_21 bearing_rad_21
	drop varphi3 lambda3 delta_lambda_31 bearing_rad_31
	drop StationLat StationLon
	
	merge 1:1 StationID using $tmp\monitor_heat_dist_urban, keep(3) nogen	//27 monitors in $tmp\monitor_heat_dist_urban are not in $tmp\pm_monitor_coordinates
	scatter dist_QH dist_qinhuai_new
	scatter dist_Alt dist_real
	histogram bearing_QH if heat_qinhuai_new==1		//Bearing of monitors north of the Qin-Huai boundary to the nearest point on that boundary has density around 180 degrees from north
	histogram bearing_QH if heat_qinhuai_new==0		//Bearing of monitors south of the Qin-Huai boundary to the nearest point on that boundary has density just over 0 degree or just under 360 degrees from north
	histogram bearing_Alt if heat_real==1			//Bearing of monitors north of the alternative boundary to the nearest point on that boundary has density around 180 degrees from north
	histogram bearing_Alt if heat_real==0			//Bearing of monitors south of the alternative boundary to the nearest point on that boundary has density just over 0 degree or just under 360 degrees from north
	drop StationLat StationLon heat_real heat_qinhuai_new dist_real dist_qinhuai_new urban
	
	drop dist_QH lng_QH lat_QH dist_Alt lng_Alt lat_Alt		//Keep only the bearings from the monitor to the nearest point on either boundary
	save $tmp\monitor2nearestQH_Alt, replace
end
	
	

capture program drop wind_1within200kmMonitor_year
program define       wind_1within200kmMonitor_year
	use $tmp\pm_monitor_coordinates, clear
	
	forvalues i = 2013/2018 {				//To find the 1 nearest wind station to each pollutant monitor among wind stations that are active each year
		preserve
			geonear StationID StationLat StationLon using $tmp\active_windStations_`i', neighbors(windStation latitude longitude) nearcount(1) long   //Distance from each pollutant monitor to the 1 nearest wind station active that year
			ren km_to_windStation dist_monitor2windStation
			unique windStation				//Wind stations qualifying as the 1 nearest by year: 2013: 254, ... stable 
			summ dist_monitor2windStation	//Mean [max] distance (km) by year:                  2013: 37 [141], ... stable
			assert dist_monitor2windStation<=200				//Each qualifying 1 nearest wind station active that year is always within 200 km of the monitor to which it is nearest
			drop dist_monitor2windStation
			gen year=`i'
			isid StationID										//One observation by monitor, mapping into the 1 nearest wind station active that year
			bysort windStation: gen tempMonitorID=_n
			isid windStation tempMonitorID						//Alternatively, one observation by wind station-tempMonitorID
			save $tmp\monitor2activeWindStation_`i', replace
			bysort windStation (tempMonitorID): keep if _n==_N
			ren tempMonitorID noMonitors
			drop StationID
			isid windStation									 //One observation by assigned 1 nearest wind station active that year
			save $tmp\noMonitorsByActiveWindStation_`i', replace
		restore
		}
	
	forvalues i = 2013/2018 {									//Merging data in this loop proved tricky to get right, thus the isid commands around here!
		use $tmp\wind_windStation_date_hour_`i', clear			//Recall these data vary by windStation-date-(3-)hour within a year
		merge m:1 windStation using $tmp\noMonitorsByActiveWindStation_`i', assert(1 3) keep(3) nogen	//Some wind stations are not the 1 nearest to any monitor that year, so of no use
		erase $tmp\noMonitorsByActiveWindStation_`i'.dta
		isid windStation date hour
		expand noMonitors
		bysort windStation date hour: gen tempMonitorID=_n
		merge m:1 windStation tempMonitorID using $tmp\monitor2activeWindStation_`i', assert(3) nogen
		erase $tmp\monitor2activeWindStation_`i'.dta
		isid StationID                 date hour				//These alternative variables uniquely identify the data
		isid windStation tempMonitorID date hour
		drop noMonitors tempMonitorID
		drop windStation latitude longitude
		tab hour
		assert year==`i'
		save $tmp\wind_monitor_date_hour_`i', replace
		erase $tmp\wind_windStation_date_hour_`i'.dta			//These files contain wind direction by monitor-date-hour within each year
		}
	
	use $tmp\wind_monitor_date_hour_2013, clear					//Append wind direction by monitor-date-hour over years 2013-2018
	forvalues i = 2014/2018 {
		append using $tmp\wind_monitor_date_hour_`i'
		erase $tmp\wind_monitor_date_hour_`i'.dta
		}
	erase $tmp\wind_monitor_date_hour_2013.dta
	assert wnd_direct>0 & wnd_direct<=360 if wnd_direct~=.		//North is 360 degrees (not 0 degree)
	tab wnd_type						//Calm wind accounts for ~5% of monitor-date-(3-)hour observations
	tab year wnd_type					//Not too different year by year
	bysort year: sum wnd_direct
	
	merge m:1 StationID using $tmp\monitor2nearestQH_Alt, assert(3) nogen		//Many monitor-date-hour observations of wind direction merged to one monitor observation of bearings to the Qin-Huai boundary and to the alternative boundary 
	erase $tmp\monitor2nearestQH_Alt.dta
	assert bearing_QH>0&bearing_QH<360&bearing_Alt>0&bearing_Alt<360 			//It so happens that there is no bearing of monitor to a boundary that is exactly north
	
	*Obtain the difference in wind direction ("degrees from north") and the bearing from the monitor to the nearest point on either boundary
	gen bearingQH_wdir  = abs(bearing_QH -wnd_direct)*(abs(bearing_QH -wnd_direct)<=180) + (360-abs(bearing_QH -wnd_direct))*(abs(bearing_QH -wnd_direct)>180)
	gen bearingAlt_wdir = abs(bearing_Alt-wnd_direct)*(abs(bearing_Alt-wnd_direct)<=180) + (360-abs(bearing_Alt-wnd_direct))*(abs(bearing_Alt-wnd_direct)>180)
	assert bearingQH_wdir<=180  if wnd_direct~=.
	assert bearingAlt_wdir<=180 if wnd_direct~=.
	
	*Dummy for wind blowing toward boundary in that date-hour, or calm wind which implies missing wind direction: 
	*There is less "policy" mixing for these monitor-date-hour instances, either because the monitor is upwind of the boundary (and thus downwind of locations with the same policy state) or because the wind is not blowing
	*Qin-Huai boundary 
	gen windBlowsToQH_or_calm = (bearingQH_wdir<=90 | wnd_type=="C")
	tab windBlowsToQH_or_calm wnd_type						//Calm => windBlowsToQH_or_calm=1 
	bysort windBlowsToQH_or_calm: sum bearingQH_wdir		//windBlowsToQH_or_calm=1 => bearingQH_wdir<=90 if bearingQH_wdir~=. and windBlowsToQH_or_calm=0 => bearingQH_wdir>90 if bearingQH_wdir~=.
	*Alternative boundary
	gen windBlowsToAlt_or_calm = (bearingAlt_wdir<=90 | wnd_type=="C")
	tab windBlowsToAlt_or_calm wnd_type						//Calm => windBlowsToAlt_or_calm=1 
	bysort windBlowsToAlt_or_calm: sum bearingAlt_wdir		//windBlowsToAlt_or_calm=1 => bearingAlt_wdir<=90 if bearingAlt_wdir~=. and windBlowsToAlt_or_calm=0 => bearingAlt_wdir>90 if bearingAlt_wdir~=.
	
	summ windBlowsToQH_or_calm windBlowsToAlt_or_calm		//~55% of monitor-date-(3-)hour observations in the case of either boundary
	
	gen month=month(date)
	collapse windBlowsToQH_or_calm windBlowsToAlt_or_calm, by(StationID year month)
	ren StationID monitor_code
	isid monitor_code year month
	save $tmp\windBlowsToB_monitor_year_month, replace
	
	bysort year: sum windBlowsToQH_or_calm windBlowsToAlt_or_calm		//Not too different year by year
	histogram windBlowsToQH_or_calm				//This is the proportion of date-(3-)hour observations over monitor by year-month for which wind blows toward boundary or the wind is calm (less mixing from the other side of the boundary)
	histogram windBlowsToAlt_or_calm
	collapse windBlowsToQH_or_calm windBlowsToAlt_or_calm, by(monitor_code)	//Means by monitor over 6 years: take p25 = .4841187 Qin-Huai and .4768546 Alternative as regression subsample cutoff
	summ windBlowsToQH_or_calm windBlowsToAlt_or_calm, detail
end


	
*wind_station_date_hour					//To run, contact the corresponding author to obtain the NOAA data at the station-date-hour level, totaling 6.7 GB
monitor_boundary_bearing
*wind_1within200kmMonitor_year			//This one takes a little longer with as much as 25m obs at some point (here we did not loop over years but tackled all years in one go)



*From here to the end, three programs prepare data to conduct a robustness test that accounts for prevailing wind direction (Fig. A10, 
*Table A15). C. 
*Alternatively, we are also publishing the datasets generated by these three programs, for the purpose of replicating Table A15, 
*specifically, windBlowsToB_monitor_year_month.dta and active_windStations_2013.dta to active_windStations_2018.dta.
